clear
set more off
capture log close
local pathtab1 "/data"
local pathtab2 "/data"

insheet using "`pathtab1'/STATA_RV_7162015-195.csv", clear
label data STATA_RV_7162015_195
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 1995"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable fips "FIPS State code"
label variable sector "Sector of institution"
label variable opeid "Office of Postsecondary Education id"
label variable zip "ZIP + four"


label define label_sector 0 "Administrative unit"
label values sector label_sector
label define label_sector 1 "Public 4-year or above", add
label values sector label_sector
label define label_sector 2 "Private nonprofit 4-year or above", add
label values sector label_sector
label define label_sector 3 "Private for-profit 4-year or above", add
label values sector label_sector
label define label_sector 4 "Public 2-year", add
label values sector label_sector
label define label_sector 5 "Private nonprofit 2-year", add
label values sector label_sector
label define label_sector 6 "Private for-profit 2-year", add
label values sector label_sector
label define label_sector 7 "Public less-than-2-year", add
label values sector label_sector
label define label_sector 8 "Private nonprofit less-than-2-year", add
label values sector label_sector
label define label_sector 9 "Private for-profit less-than-2-year", add
label values sector label_sector

drop if opeid==-2 | opeid==.
sort opeid

gen opeid_alt=opeid
tostring opeid_alt, replace

replace opeid_alt="0"+opeid_alt+"00" if opeid>9999 & opeid<=99999
replace opeid_alt="00"+opeid_alt if opeid>99999 & opeid<=999999
replace opeid_alt="0"+opeid_alt if opeid>999999 & opeid<=9999999

gen opeid_1=substr(opeid_alt,1,6)
gen opeid_2=substr(opeid_alt,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop opeid_alt

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1
drop temp sumtemp

save "`pathtab1'/ipeds_1995.dta", replace

/*******
1996
********/
insheet using "`pathtab1'/STATA_RV_7162015-276.csv", clear
label data STATA_RV_7162015_276
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 1996"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable fips "FIPS State code"
label variable sector "Sector of institution"
label variable opeid "Office of Postsecondary Education id"
label variable zip "ZIP + four"


/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_1996.dta", replace

/******
1997
*******/
insheet using "`pathtab1'/STATA_RV_7232015-338.csv", clear
label data STATA_RV_7232015_338
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 1997"
label variable instnm "Institution name"
label variable stabbr "Post Office State abbreviation code"
label variable fips "FIPS State code"
label variable idx "UNITID of parent institution reporting full-year enrollment"
label variable sector "Sector of institution"
label variable opeid "Office of Postsecondary Education id"
label variable zip "ZIP + four"


/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_1997.dta", replace


/*******
1998
********/
insheet using "`pathtab1'/STATA_RV_7162015-95.csv", clear
label data STATA_RV_7162015_95
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 1998"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable zip "ZIP + four"
label variable fips "FIPS State code"
label variable opeid "Office of Postsecondary Education ID"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_1998.dta", replace

/*******
1999
********/
insheet using "`pathtab1'/STATA_RV_7162015-784.csv", clear
label data STATA_RV_7162015_784
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 1999"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable zip "ZIP + four"
label variable fips "FIPS State code"
label variable opeid "Office of Postsecondary Education ID"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_1999.dta", replace

/******
2000
*******/
insheet using "`pathtab1'/STATA_RV_7162015-851.csv", clear
label data STATA_RV_7162015_851
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2000"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable zip "ZIP + four"
label variable fips "FIPS State code"
label variable opeid "Office of Postsecondary Education ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2000.dta", replace

/******
2001
*******/
insheet using "`pathtab1'/STATA_RV_7162015-292.csv", clear
label data STATA_RV_7162015_292
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2001"
label variable instnm "Institution (entity) name"
label variable stabbr "Post Office State abbreviation code"
label variable zip "ZIP + four"
label variable fips "FIPS State code"
label variable opeid "Office of Postsecondary Education ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2001.dta", replace

/********
2002
*********/
insheet using "`pathtab1'/STATA_RV_7162015-45.csv", clear
label data STATA_RV_7162015_45
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2002"
label variable instnm "Institution (entity) name"
label variable stabbr "USPS state abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)

drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2002.dta", replace

/******
2003
*******/
insheet using "`pathtab1'/STATA_RV_7162015-627.csv", clear
label data STATA_RV_7162015_627
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2003"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2003.dta", replace

/*******
2004
********/
insheet using "`pathtab1'/STATA_RV_7162015-86.csv", clear
label data STATA_RV_7162015_86
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2004"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp


save "`pathtab1'/ipeds_2004.dta", replace

/******
2005
*******/
insheet using "`pathtab1'/STATA_RV_7162015-105.csv", clear
label data STATA_RV_7162015_105
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2005"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"


/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp


save "`pathtab1'/ipeds_2005.dta", replace

/*****
2006
******/
insheet using "`pathtab1'/STATA_RV_7162015-124.csv", clear
label data STATA_RV_7162015_124
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2006"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"


/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2006.dta", replace

/******
2007
*******/
insheet using "`pathtab1'/STATA_RV_7162015-128.csv", clear
label data STATA_RV_7162015_128
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2007"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp


save "`pathtab1'/ipeds_2007.dta", replace

/******
2008
*******/
insheet using "`pathtab1'/STATA_RV_7162015-238.csv", clear
label data STATA_RV_7162015_238
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2008"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2008.dta", replace

/*******
2009
*******/
insheet using "`pathtab1'/STATA_RV_7162015-257.csv", clear
label data STATA_RV_7162015_257
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2009"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2009.dta", replace

/******
2010
*******/
insheet using "`pathtab1'/STATA_RV_7162015-729.csv", clear
label data STATA_RV_7162015_729
label variable unitid "UNITID"
label variable instnm "Institution Name"
label variable year "Survey year 2010"
label variable instnm "Institution (entity) name"
label variable stabbr "State abbreviation"
label variable zip "ZIP code"
label variable fips "FIPS state code"
label variable opeid "Office of Postsecondary Education (OPE) ID Number"
label variable sector "Sector of institution"

/*Fixing duplicates in opeid and expressing it as in the other datesets to merge*/
drop if opeid=="" | opeid=="99999999" | opeid=="-2"
sort opeid

gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]

drop if sector==0 & temp==1
drop temp

sort opeid zip
gen temp=1 if opeid==opeid[_n-1] | opeid==opeid[_n+1]
by opeid: gen sumtemp=sum(temp)
drop if temp==1 & sumtemp>1

gen opeid_1=substr(opeid,1,6)
gen opeid_2=substr(opeid,7,2)
drop opeid
gen opeid=opeid_1+"-"+opeid_2 if opeid_1~=""
drop opeid_1 opeid_2 
drop temp sumtemp

save "`pathtab1'/ipeds_2010.dta", replace

use "`pathtab1'/ipeds_1995.dta", clear
forvalues i=1996(1)2010 {
append using "`pathtab1'/ipeds_`i'.dta", force
}
drop v4 zip 

rename year year_enroll
rename sector first_col_sector
rename stabbr first_col_state
rename fips first_col_state_fips
rename opeid first_col_opeid_nslds
rename instnm first_col_instnm_nslds

sort first_col_opeid_nslds year_enroll

save "`pathtab2'/ipeds_state.dta", replace

gen first_col_opeid6_nslds=substr(first_col_opeid_nslds,1,6)
gen first_col_opeid2_nslds=substr(first_col_opeid_nslds,8,2)

/*Now I will consider a dataset where I just use the first 6 digit of opeid,
for those cases I wasn't able to merge using the 8 digit opeid*/
sort year_enroll first_col_opeid6_nslds
egen group=group(year_enroll first_col_opeid6_nslds)
gen temp=1 if group==group[_n-1] | group==group[_n+1]

drop if temp~=1
drop temp
gen temp=1 if first_col_opeid2_nslds=="00"

bys group: egen sumtemp=sum(temp)

gen temp2=1 if (first_col_opeid2_nslds=="01" | first_col_opeid2_nslds=="A1") & sumtemp==0

bys group: egen sumtemp2=sum(temp2)

keep if (sumtemp==1 & temp==1) | (sumtemp2==1 & temp2==1)

drop first_col_instnm_nslds
keep year_enroll first_col_state first_col_state_fips first_col_sector first_col_opeid_nslds first_col_opeid6_nslds

save "`pathtab2'/ipeds_state_opeid6.dta", replace

forvalues i=1995(1)2010 {
erase "`pathtab1'/ipeds_`i'.dta"
}

/*It could be the case that I have information for a school, but not for a particular
year, so I'll create a dataset to merge at the end based just on opeid's*/

use "`pathtab2'/ipeds_state.dta", clear

sort first_col_opeid_nslds year_enroll
by first_col_opeid_nslds: keep if _n==1
drop year_enroll unitid first_col_instnm_nslds idx

save "`pathtab2'/ipeds_state_noyear.dta", replace

